# -*- coding: utf-8 -*-
"""
filter_keywords.py
Reads one or more Excel files, finds the keyword column, filters rows where the
keyword contains any of the target terms, and returns the keyword with its adjacent
search volume column.

Usage:
    python filter_keywords.py file1.xlsx file2.xlsx --out filtered_keywords.xlsx
Optional args:
    --keyword-col "کلمه کلیدی"   # force keyword column name
    --volume-col  "سرچ ولوم"     # force volume column name
    --sheet "Sheet1"              # process only this sheet name (repeatable)
"""

import argparse
import sys
import os
from typing import List, Optional, Tuple

import pandas as pd

# --- Config: target terms to match ---
# Persian search-intent terms. filter_rows() keeps a row when its keyword
# contains ANY of these as a substring; every term is passed through
# normalize_persian() at match time, so spelling variants are tolerated.
# Rough English glosses, in list order:
#   buy (noun), buy (imperative), price, organic, genuine, natural,
#   original (two transliterations), pure, traditional, herbal shop,
#   best, top, type, types, difference, comparison, difference, how many/much
TARGET_TERMS = [
    "خرید","بخر","قیمت","ارگانیک","اصل","طبیعی",
    "اورجینال","ارجینال","خالص","سنتی","عطاری",
    "بهترین","برترین","نوع","انواع","فرق","مقایسه","تفاوت", "چند"
]

# Candidate header names for column auto-detection.
# detect_columns() normalizes both these names and the sheet headers with
# normalize_persian() and tries the candidates in list order, accepting a
# header that equals or contains the candidate.
# Keyword headers: Persian spellings of "keyword" / "phrase" / "key phrase"
# (with a space, with a ZWNJ half-space, and as one word) plus English names.
KEYWORD_COL_CANDIDATES = [
    "کلمه کلیدی","کلمه‌کلیدی","کلیدواژه","عبارت","عبارت کلیدی",
    "keyword","query","kw","search term","term"
]
# Volume headers: Persian for "search volume" (transliterated and native),
# "search", "average search", plus English names.
VOLUME_COL_CANDIDATES = [
    "سرچ ولوم","سرچ","میانگین جستجو","حجم جستجو","جستجو",
    "search volume","volume","avg search","sv"
]

# Single-pass character mapping used by normalize_persian().
_PERSIAN_CHAR_MAP = str.maketrans({
    "\u064a": "\u06cc",  # ARABIC YEH          -> FARSI YEH
    "\u0649": "\u06cc",  # ARABIC ALEF MAKSURA -> FARSI YEH
    "\u0643": "\u06a9",  # ARABIC KAF          -> KEHEH (Persian kaf)
    "\u0640": None,      # TATWEEL (kashida) is purely decorative: drop it
    "\u200b": "\u200c",  # ZERO WIDTH SPACE used as a half-space -> ZWNJ
    "\u00ac": "\u200c",  # NOT SIGN, the legacy-keyboard half-space -> ZWNJ
})


def normalize_persian(s: str) -> str:
    """Return a canonical, lowercased form of *s* for tolerant comparisons.

    Steps applied:
      * Arabic letter variants (yeh, alef maksura, kaf) become their Persian
        forms and tatweel is removed.
      * Half-space look-alikes are unified to ZWNJ (U+200C) and runs of
        consecutive ZWNJ are collapsed to one.
      * Leading/trailing whitespace is stripped and every internal run of
        whitespace (including tabs and no-break spaces) becomes one space.
      * The result is lowercased.

    Any non-string input (None, NaN, numbers, ...) yields an empty string.
    """
    if not isinstance(s, str):
        return ""

    s = s.translate(_PERSIAN_CHAR_MAP)

    # str.split() with no argument splits on any Unicode whitespace and drops
    # empty pieces, so this both strips the ends and collapses inner runs.
    s = " ".join(s.split())

    # ZWNJ is not whitespace, so repeated half-spaces need their own pass.
    while "\u200c\u200c" in s:
        s = s.replace("\u200c\u200c", "\u200c")

    return s.lower()

def detect_columns(df: pd.DataFrame,
                   forced_keyword: Optional[str],
                   forced_volume: Optional[str]) -> Tuple[Optional[str], Optional[str], Optional[int]]:
    """
    Locate the keyword and search-volume columns of *df* by header name.

    Headers and names are compared after normalize_persian().

    Args:
        df: Sheet contents; only ``df.columns`` is inspected.
        forced_keyword: Header the caller wants as keyword column. It must
            match a header exactly (after normalization); if it matches
            nothing, auto-detection is used instead.
        forced_volume: Same, for the volume column.

    Returns:
        ``(keyword_col_name, volume_col_name, keyword_col_index)``. Each item
        is None when nothing was found. This function does not guess a volume
        column by position; callers may fall back to the column at
        ``keyword_col_index + 1``.

    Auto-detection tries KEYWORD_COL_CANDIDATES / VOLUME_COL_CANDIDATES. An
    exact header match for any candidate always wins over a "header contains
    candidate" match, so e.g. "Keyword" is preferred to "Keyword Difficulty"
    regardless of column order. The auto-detected volume column is never the
    keyword column itself.
    """
    columns = list(df.columns)
    normalized = [normalize_persian(str(col)) for col in columns]

    def _find(names: List[str], allow_partial: bool, skip: Optional[int] = None) -> Optional[int]:
        """Return the position of the first header matching *names*, or None."""
        wanted = [normalize_persian(name) for name in names]
        # Pass 1: exact equality, in candidate priority order.
        for cand in wanted:
            for pos, header in enumerate(normalized):
                if pos != skip and header == cand:
                    return pos
        # Pass 2: containment, tolerant of headers like "keyword (fa)".
        if allow_partial:
            for cand in wanted:
                for pos, header in enumerate(normalized):
                    if pos != skip and cand in header:
                        return pos
        return None

    kw_idx: Optional[int] = None
    if forced_keyword:
        kw_idx = _find([forced_keyword], allow_partial=False)
    if kw_idx is None:
        kw_idx = _find(KEYWORD_COL_CANDIDATES, allow_partial=True)

    vol_idx: Optional[int] = None
    if forced_volume:
        # An explicit request is honoured as-is.
        vol_idx = _find([forced_volume], allow_partial=False)
    if vol_idx is None:
        # Generic candidates such as "search" can also occur inside the
        # keyword header, so the keyword column is excluded here.
        vol_idx = _find(VOLUME_COL_CANDIDATES, allow_partial=True, skip=kw_idx)

    kw_col = columns[kw_idx] if kw_idx is not None else None
    vol_col = columns[vol_idx] if vol_idx is not None else None
    return kw_col, vol_col, kw_idx

def filter_rows(df: pd.DataFrame, kw_col: str) -> pd.DataFrame:
    """Return the rows of *df* whose keyword mentions at least one target term.

    Cells of ``df[kw_col]`` are stringified and run through
    normalize_persian(), as is every entry of TARGET_TERMS; a row is kept when
    any normalized term occurs as a plain substring of the normalized keyword.
    """
    needles = [normalize_persian(term) for term in TARGET_TERMS]

    def mentions_target(text: str) -> bool:
        # Literal containment, no regex semantics.
        return any(needle in text for needle in needles)

    haystack = df[kw_col].astype(str).map(normalize_persian)
    # astype(bool) keeps the selector boolean even when the sheet has no rows.
    selected = haystack.map(mentions_target).astype(bool)
    return df.loc[selected]

def _filter_sheet(df: pd.DataFrame,
                  path: str,
                  sheet: str,
                  forced_keyword: Optional[str],
                  forced_volume: Optional[str]) -> Optional[pd.DataFrame]:
    """Filter one sheet into File/Sheet/Keyword/SearchVolume rows, or None if nothing to report."""
    kw_col, vol_col, kw_idx = detect_columns(df, forced_keyword, forced_volume)

    if kw_col is None:
        # Best guess: the first text-like column. Both classic object columns
        # and pandas' dedicated string dtypes count as text.
        for idx, dtype in enumerate(df.dtypes):
            if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
                kw_col = df.columns[idx]
                kw_idx = idx
                break

    if kw_col is None:
        print(f"[WARN] No keyword column detected in '{path}' -> '{sheet}'. Skipping.", file=sys.stderr)
        return None

    columns = list(df.columns)

    # A name-based hit that lands on the keyword column itself is useless as
    # a volume column; discard it so the positional fallback can apply.
    if vol_col is not None and kw_idx is not None and columns.index(vol_col) == kw_idx:
        vol_col = None

    # If volume col not found by name, try the column to the right of keyword.
    if vol_col is None and kw_idx is not None and kw_idx + 1 < len(columns):
        vol_col = columns[kw_idx + 1]

    sub = filter_rows(df, kw_col)
    if sub.empty:
        return None

    # Scalars are broadcast along the index of the Keyword series.
    out = pd.DataFrame({
        "File": os.path.basename(path),
        "Sheet": sheet,
        "Keyword": sub[kw_col].astype(str)
    })
    out["SearchVolume"] = sub[vol_col] if vol_col is not None else pd.NA
    return out


def process_file(path: str,
                 sheets: Optional[List[str]],
                 forced_keyword: Optional[str],
                 forced_volume: Optional[str]) -> List[pd.DataFrame]:
    """Read an Excel file and return the filtered rows of each sheet.

    Args:
        path: Path of the workbook to read.
        sheets: Sheet names to process; None or empty means every sheet.
        forced_keyword: Keyword column header to prefer over auto-detection.
        forced_volume: Volume column header to prefer over auto-detection.

    Returns:
        One DataFrame per sheet that produced matches, with the columns
        File, Sheet, Keyword and SearchVolume. Sheets that are empty,
        unreadable, lack a keyword column or have no matching rows contribute
        nothing. Problems are reported on stderr as ``[WARN]`` lines and
        never raised; an unopenable workbook yields an empty list.
    """
    results: List[pd.DataFrame] = []

    try:
        xls = pd.ExcelFile(path)  # engine is auto-detected
    except Exception as e:
        print(f"[WARN] Cannot open '{path}': {e}", file=sys.stderr)
        return results

    # The context manager closes the underlying file handle even if a sheet
    # fails in an unexpected way.
    with xls:
        sheet_names = sheets if sheets else xls.sheet_names

        for s in sheet_names:
            try:
                df = pd.read_excel(xls, sheet_name=s)
            except Exception as e:
                print(f"[WARN] Cannot read sheet '{s}' in '{path}': {e}", file=sys.stderr)
                continue

            if df.empty:
                continue

            out = _filter_sheet(df, path, s, forced_keyword, forced_volume)
            if out is not None:
                results.append(out)

    return results

def main():
    """Command-line entry point.

    Parses the arguments, runs process_file() on every input workbook,
    concatenates the matches, sorts them by file and then by descending
    numeric search volume, writes the result to ``--out`` and prints a
    preview of the first 50 rows.

    If the Excel output cannot be written, the same rows are written as CSV
    next to it (same base name, ``.csv`` extension). Exits with status 0
    when no input produced any matching row.
    """
    parser = argparse.ArgumentParser(description="Filter keywords from Excel files by target terms.")
    parser.add_argument("files", nargs="+", help="Path(s) to .xlsx/.xls file(s)")
    parser.add_argument("--out", default="filtered_keywords.xlsx", help="Output Excel filename")
    parser.add_argument("--keyword-col", dest="keyword_col", default=None, help="Exact keyword column name")
    parser.add_argument("--volume-col", dest="volume_col", default=None, help="Exact search volume column name")
    parser.add_argument("--sheet", dest="sheets", action="append", help="Only process this sheet (repeatable)")
    args = parser.parse_args()

    all_results = []
    for f in args.files:
        all_results.extend(process_file(
            f,
            sheets=args.sheets,
            forced_keyword=args.keyword_col,
            forced_volume=args.volume_col
        ))

    if not all_results:
        print("[INFO] No matching rows found.", file=sys.stderr)
        sys.exit(0)

    final_df = pd.concat(all_results, ignore_index=True)

    # Sort by File, then by SearchVolume descending. The volume column may
    # hold text, so sort on a temporary numeric copy; values that cannot be
    # parsed become NaN and end up last within their file. Mergesort is
    # stable, so ties keep their original order.
    sv_numeric = pd.to_numeric(final_df["SearchVolume"], errors="coerce")
    final_df.insert(final_df.columns.get_loc("SearchVolume")+1, "SearchVolume_num", sv_numeric)
    final_df = final_df.sort_values(by=["File", "SearchVolume_num"], ascending=[True, False], kind="mergesort")
    final_df = final_df.drop(columns=["SearchVolume_num"])

    # Save to Excel
    try:
        final_df.to_excel(args.out, index=False)
        print(f"[OK] Saved {len(final_df)} rows to '{args.out}'.")
    except Exception as e:
        print(f"[WARN] Could not write '{args.out}': {e}", file=sys.stderr)
        # Fallback to CSV. The BOM written by "utf-8-sig" lets Excel detect
        # UTF-8, so Persian keywords are not shown as mojibake.
        csv_out = os.path.splitext(args.out)[0] + ".csv"
        final_df.to_csv(csv_out, index=False, encoding="utf-8-sig")
        print(f"[OK] Saved {len(final_df)} rows to '{csv_out}' instead.")

    # Also print a preview to stdout
    with pd.option_context('display.max_rows', 50, 'display.max_columns', None, 'display.width', 120):
        print(final_df.head(50))

if __name__ == "__main__":
    main()